## this code uses raking to reweight our survey response data. It should be run first. 

rm(list=ls())
library(car)
library(ggplot2)
library(reshape)
library(survey)
library(tidyverse)
library(arm)
library(Cairo)
library(ggthemes)
library(modelr)
library(sparsereg)
library(viridis)
library(ggpubr)
library(zipcode)
library(data.table)

## set your working directory here. Make sure the root folder has a folder called "data" that contains all data inputs. 
setwd('~/Dropbox (Yale_FES)/ypccc_parrish/GND')

## read in ACS data
vars<-read.csv('ACS_17_5YR_B01001B_metadata.csv')

vars<-vars[c(3,seq(15,33,2),seq(45,63,2)),]
cols<-as.character(vars[,1])

black<-read.csv('ACS_17_5YR_B01001B_with_ann.csv')
black<-black[,cols]
black$race<-"Black/African American"

amindian<-read.csv('ACS_17_5YR_B01001C_with_ann.csv')[,cols]
amindian$race<-"Native American/Pacific Islander"

asian<-read.csv('ACS_17_5YR_B01001D_with_ann.csv')[,cols]
asian$race<-"Asian"

pacisl<-read.csv('ACS_17_5YR_B01001E_with_ann.csv')[,cols]
pacisl$race<-"Native American/Pacific Islander"

other<-read.csv('ACS_17_5YR_B01001F_with_ann.csv')[,cols]
other$race<-"Other"

twoplus<-read.csv('ACS_17_5YR_B01001G_with_ann.csv')[,cols]
twoplus$race<-'Other'

white<-read.csv('ACS_17_5YR_B01001H_with_ann.csv')[,cols]
white$race<-'White/Caucasian'

hisp<-read.csv('ACS_17_5YR_B01001I_with_ann.csv')[,cols]
hisp$race<-'Hispanic/Latino'

setdiff(as.character(asian[1,1:ncol(asian)-1]),
        as.character(white[1,1:ncol(white)-1]))
setdiff(as.character(twoplus[1,1:ncol(twoplus)-1]),
        as.character(white[1,1:ncol(white)-1]))
setdiff(as.character(other[1,1:ncol(other)-1]),
        as.character(white[1,1:ncol(white)-1]))
setdiff(as.character(pacisl[1,1:ncol(pacisl)-1]),
        as.character(white[1,1:ncol(white)-1]))
setdiff(as.character(asian[1,1:ncol(asian)-1]),
        as.character(white[1,1:ncol(white)-1]))
setdiff(as.character(amindian[1,1:ncol(amindian)-1]),
        as.character(white[1,1:ncol(white)-1]))
setdiff(as.character(black[1,1:ncol(black)-1]),
        as.character(white[1,1:ncol(white)-1]))

## all cols have same descriptions

counts<-do.call(rbind,list(white,black[2,],hisp[2,],twoplus[2,],
                           other[2,],asian[2,],pacisl[2,],amindian[2,]))
## group ages: 18-29=1
## 30-44=2
## 45-65=3
## 60+=4
## need groups to be 18-24; 25-34; 35=44; 45-54; 55-64; 65+ 
vars$age<-c('total',rep(c(rep("18-24",2),rep("25-34",2),"35-44",
                          "45-54","55-64",rep("65+",3)),2))
vars$gender<-c('total',rep(c(rep('Male',10),rep('Female',10))))
vars<-vars%>%
  dplyr::select(-Id)
counts[,1]
counts<-counts[2:nrow(counts),]
counts2<-counts%>%
  tidyr::gather(var,val,-race)%>%
  left_join(vars,by=c("var"="GEO.id"))%>%
  dplyr::mutate(#race=as.factor(race),
                # age=as.factor(age),
                # gender=as.factor(gender),
                val=as.numeric(val))%>%
  group_by(race,age,gender)%>%
  dplyr::summarise(val=sum(val))%>%
  dplyr::filter(age!="total")%>%
  ungroup()

# counts2<-data.table(counts2)

## expand rows
## this a time-consuming and clunky way to do this, but vectorized ways (
## commented out) all return an error that they run out of vector memory. 

names(counts2)[4]<-"Freq"
 
total<-sum(counts2$Freq)
counts2$total<-total
counts2<-counts2%>%mutate(prop=Freq/total)
sum(counts2$prop)

write.csv(counts2,'acs_strata.csv')

#########################
## Import CSV file from Qualtrics
## The file has two lines of string headers
#########################
df.gnd <- read.csv("GND Conjoint_July 15, 2019_15.06.csv",skip=0,head=T,
                   stringsAsFactors=FALSE)
## respondents accessing on mobile phones. 

df.gnd2 <- df.gnd[3:nrow(df.gnd),]

df.gnd2<-df.gnd2%>%
  dplyr::mutate(date=as.character(StartDate))%>%
  dplyr::mutate(date=substr(date,start=6,stop=10))%>%
  dplyr::mutate(date=as.numeric(gsub("-","",date)))
nrow(df.gnd2[df.gnd2$date<=621,])/nrow(df.gnd2)
head(df.gnd2$date)


## Read in Age metadata and merge
age.meta <- read.csv("Age Data - Qual2594-0320UnitedStates2 .csv")
age.meta$rid <- age.meta$Response.ID..rid # to faciliate join with df.gnd2
age.meta$rid <- as.character(age.meta$rid)

sum(age.meta$rid%in%df.gnd2$rid) # character case differences

df.gnd2$rid <- toupper(df.gnd2$rid)

sum(age.meta$rid%in%df.gnd2$rid)#now all match

#note that the age data is not merge into the same column as in the survey - seemed better to create new age variable after the fact
df.gnd2 <- left_join(df.gnd2, age.meta)
## new age var is AGE.2

## NOW DEVELOP DEFINITIVE GENDER, AGE, RACE variables for any responses (complete or not) possible
df.gnd2$gender_Q <- df.gnd2$gender


table(df.gnd2$gender)

# let's delete under 18 folks immediately because they are not allowed in our sample
under18 <- df.gnd2$rid[which(df.gnd2$age=="Under 18")]
df.gnd2 <- df.gnd2[-which(df.gnd2$rid%in%under18),]

# now it looks like we have the following age categories, 18-24,  25-34, 35-44, 45-54,  55-64, 65+
df.gnd2$age_Q <- NA
df.gnd2$age_Q[which(df.gnd2$age=="18-24")] <- "18-24"
df.gnd2$age_Q[which(df.gnd2$age=="25-34")] <- "25-34"
df.gnd2$age_Q[which(df.gnd2$age=="35-44")] <- "35-44"
df.gnd2$age_Q[which(df.gnd2$age=="45-54")] <- "45-54"
df.gnd2$age_Q[which(df.gnd2$age=="55-64")] <- "55-64"
df.gnd2$age_Q[which(df.gnd2$age=="65+")] <- "65+"

df.gnd2$age_Q[which(df.gnd2$AGE.2.>17&df.gnd2$AGE.2.<25)] <- "18-24"
df.gnd2$age_Q[which(df.gnd2$AGE.2.>24&df.gnd2$AGE.2.<35)] <- "25-34"
df.gnd2$age_Q[which(df.gnd2$AGE.2.>34&df.gnd2$AGE.2.<45)] <- "35-44"
df.gnd2$age_Q[which(df.gnd2$AGE.2.>44&df.gnd2$AGE.2.<55)] <- "45-54"
df.gnd2$age_Q[which(df.gnd2$AGE.2.>54&df.gnd2$AGE.2.<65)] <- "55-64"
df.gnd2$age_Q[which(df.gnd2$AGE.2.>64)] <- "65+"

table(df.gnd2$age_Q)
table(df.gnd2$age)
table(df.gnd2$AGE.2)


df.gnd2$race_Q <- NA

table(df.gnd2$race) # the 6 is a leftover from the previous labels and should now be a merged with Native American/Pacific Islander

df.gnd2$race_Q[which(df.gnd2$race=="Asian")] <- "Asian"
df.gnd2$race_Q[which(df.gnd2$race=="Black/African American")] <- "Black/African American"
df.gnd2$race_Q[which(df.gnd2$race=="Hispanic/Latino")] <- "Hispanic/Latino"
df.gnd2$race_Q[which(df.gnd2$race=="Native American/Pacific Islander")] <- "Native American/Pacific Islander"
df.gnd2$race_Q[which(df.gnd2$race==6)] <- "Native American/Pacific Islander"
df.gnd2$race_Q[which(df.gnd2$race=="Other")] <- "Other"
df.gnd2$race_Q[which(df.gnd2$race=="White/Caucasian")] <- "White/Caucasian"

table(df.gnd2$race_Q)

### recode income variable into 3 groups
df.gnd2$income<-NA
df.gnd2$income[df.gnd2$Q106=="Less than $25,000"|df.gnd2$Q106=="$25,000 to $34,999"|df.gnd2$Q106=="$35,000 to $49,999"]<-"<$50K"
df.gnd2$income[df.gnd2$Q106=="$50,000 to $74,999"|df.gnd2$Q106=="75,000 to $99,999"]<-"$50K-$100K"
df.gnd2$income[df.gnd2$Q106=="$100,000 to $149,999"|df.gnd2$Q106=="$150,000 or more"]<-"$100K+"
prop.table(table(df.gnd2$income))
table(df.gnd2$Q106)

## eliminate 3rd gender category
df.gnd2<-df.gnd2[df.gnd2$gender_Q=="Female"|df.gnd2$gender_Q=="Male",]


## mobile phone users: 
table(df.gnd2$mobile)
## screener 1
table(df.gnd2$Q116)
table(df.gnd2$Q117)


table(df.gnd2$Q116)
df.gnd.screen1<-df.gnd2%>%
  dplyr::filter(Q116=="Annual Government Spending" | 
                  Q116=="Economic Programs" | 
                  Q116=="Foreign Affairs" | 
                  Q116=="Social Programs")

table(df.gnd2$Q117)
df.gnd.screen2<-df.gnd2%>%
  dplyr::filter(Q117=="Annual Government Spending"| 
                  Q117=="Electricity Generation" | 
                  Q117=="Investment" | 
                  Q117=="Trade Policy")


## subset to completes by eliminating rows where zip is missing
df.gnd2$zip[which(df.gnd2$zip=="")] <- NA
df.gnd2<-df.gnd2[which(!is.na(df.gnd2$zip)),]
table(df.gnd2$gender_Q[which(!is.na(df.gnd2$zip))])
table(df.gnd2$gender_Q)

round(prop.table(table(df.gnd2$Q106))*100,0)



### NOW TRYING TO FIGURE OUT WHAT THE "GOOD QUOTAS" are per Qualtrics

table(df.gnd2$gc)
table(df.gnd2$age_Q[which(df.gnd2$gc==1)])
table(df.gnd2$race_Q[which(df.gnd2$gc==1)])
table(df.gnd2$gender_Q[which(df.gnd2$gc==1)])
table(df.gnd2$age_Q)

## load dataset of pop proportions
prop<-read.csv('acs_strata.csv',stringsAsFactors = FALSE)
table(prop$age)
unique(df.gnd2$age_Q)
table(df.gnd2$race_Q)
table(prop$race)
table(df.gnd2$gender_Q)

df.gnd2<-dplyr::mutate(df.gnd2,stratum=paste(race_Q,age_Q,gender_Q,sep="."))
df.gnd.screen1<-dplyr::mutate(df.gnd.screen1,stratum=paste(race_Q,age_Q,gender_Q,sep="."))
df.gnd.screen2<-dplyr::mutate(df.gnd.screen2,stratum=paste(race_Q,age_Q,gender_Q,sep="."))




names(prop)<-c("X","race_Q","age_Q","gender_Q",'Freq','total','prop')
prop<-prop%>%dplyr::mutate(stratum=paste(race_Q,age_Q,gender_Q,sep="."))
# prop2<-prop%>%
#   dplyr::mutate(prop=prop*1000000)%>%
#   dplyr::select(-c(Freq,total,X))
# prop<-prop%>%dplyr::select(race_Q,age_Q,gender_Q,Freq)

# prop<-prop%>%dplyr::select(race_Q,age_Q,gender_Q,Freq)
### prop needs to have a row for each respondent in the Freq columnn so we can create xtabs

propdist<-prop%>%
  dplyr::mutate(Freq=prop*nrow(df.gnd2))%>%
  dplyr::select(stratum,Freq)

setdiff(propdist$stratum,df.gnd2$stratum)
setdiff(propdist$stratum,df.gnd.screen1$stratum)
setdiff(propdist$stratum,df.gnd.screen2$stratum)

propdist<-propdist%>%
  dplyr::filter(stratum!=setdiff(propdist$stratum,df.gnd2$stratum))
propdist2<-propdist%>%
  dplyr::filter(stratum!=setdiff(propdist$stratum,df.gnd.screen1$stratum))
write.csv(propdist,file='acs_strata_asmarginal.csv')
#### create weights: sample without attention check failures ####
View(df.gnd2[grepl("NA",df.gnd2$stratum)==TRUE,])
df.gnd2<-df.gnd2[grepl("NA",df.gnd2$stratum)==FALSE,]
gnd.svy.unweighted <- svydesign(ids=~1, data=df.gnd2)
gnd.svy.rake <- rake(design = gnd.svy.unweighted,
                     sample.margins = list(~stratum),
                     population.margins = list(propdist))
summary(weights(gnd.svy.rake))

### trim weights to fit between 3 and .3
gnd.svy.rake.trim <- trimWeights(gnd.svy.rake, lower=0.3, upper=3,
                                 strict=TRUE) 

length(weights(gnd.svy.rake.trim))
head(gnd.svy.rake.trim$variables)[,1]

df.gnd2.weighted<-cbind(gnd.svy.rake.trim$variables,weights(gnd.svy.rake.trim))
write.csv(df.gnd2.weighted,'gnd_weighted.csv')

#### weights for data with screen fails included ####
#### create weights ####
View(df.gnd.screen1[grepl("NA",df.gnd.screen1$stratum)==TRUE,]) ## missing age var for many of these
df.gnd.screen1<-df.gnd.screen1[grepl("NA",df.gnd.screen1$stratum)==FALSE,]
gnd.svy.unweighted <- svydesign(ids=~1, data=df.gnd.screen1)
gnd.svy.rake <- rake(design = gnd.svy.unweighted,
                     sample.margins = list(~stratum),
                     population.margins = list(propdist2))
summary(weights(gnd.svy.rake))

### trim weights to fit between 3 and .3
gnd.svy.rake.trim <- trimWeights(gnd.svy.rake, lower=0.3, upper=3,
                                 strict=TRUE) 

length(weights(gnd.svy.rake.trim))
head(gnd.svy.rake.trim$variables)[,1]

df.gnd.screen1.weighted<-cbind(gnd.svy.rake.trim$variables,weights(gnd.svy.rake.trim))
write.csv(df.gnd.screen1.weighted,'gnd_screen1_weighted.csv')

#### new set of weights for 2nd conjoint ####
## drop respondents who had sponsor options for transportation variable
View(df.gnd2[1:10,c(91:105)]) ## out...9 is the sponsor var
df.gnd3<-df.gnd2%>%
  dplyr::filter(out.m_9!="Democrats"&out.m_9!="Democrats and some Republicans"&
                  out.n_9!="Democrats"&out.n_9!="Democrats and some Republicans"&
                  out.o_9!="Democrats"&out.o_9!="Democrats and some Republicans")
df.gnd.screen2<-df.gnd.screen2%>%
  dplyr::filter(out.m_9!="Democrats"&out.m_9!="Democrats and some Republicans"&
                  out.n_9!="Democrats"&out.n_9!="Democrats and some Republicans"&
                  out.o_9!="Democrats"&out.o_9!="Democrats and some Republicans")


View(df.gnd3[grepl("NA",df.gnd3$stratum)==TRUE,])
df.gnd3<-df.gnd3[grepl("NA",df.gnd3$stratum)==FALSE,]
View(df.gnd.screen2[grepl("NA",df.gnd.screen2$stratum)==TRUE,])
df.gnd.screen2<-df.gnd.screen2[grepl("NA",df.gnd.screen2$stratum)==FALSE,]
propdist<-propdist%>%
  dplyr::filter(stratum!=setdiff(propdist$stratum,df.gnd3$stratum))
# propdist2<-propdist%>%
#   dplyr::filter(stratum!=setdiff(propdist$stratum,df.gnd.screen2$stratum))
gnd.svy.unweighted <- svydesign(ids=~1, data=df.gnd3)
gnd.svy.rake <- rake(design = gnd.svy.unweighted,
                     sample.margins = list(~stratum),
                     population.margins = list(propdist))

summary(weights(gnd.svy.rake))

### trim weights to fit between 3 and .3
gnd.svy.rake.trim <- trimWeights(gnd.svy.rake, lower=0.3, upper=3,
                                 strict=TRUE) 

length(weights(gnd.svy.rake.trim))
head(gnd.svy.rake.trim$variables)[,1]

df.gnd3.weighted<-cbind(gnd.svy.rake.trim$variables,weights(gnd.svy.rake.trim))
write.csv(df.gnd3.weighted,'gnd_weighted_conjoint2.csv')

### for data that includes screener failures ###
gnd.svy.unweighted <- svydesign(ids=~1, data=df.gnd.screen2)
gnd.svy.rake <- rake(design = gnd.svy.unweighted,
                     sample.margins = list(~stratum),
                     population.margins = list(propdist))

summary(weights(gnd.svy.rake))

### trim weights to fit between 3 and .3
gnd.svy.rake.trim <- trimWeights(gnd.svy.rake, lower=0.3, upper=3,
                                 strict=TRUE) 

length(weights(gnd.svy.rake.trim))
head(gnd.svy.rake.trim$variables)[,1]

df.gnd.screen2.weighted<-cbind(gnd.svy.rake.trim$variables,weights(gnd.svy.rake.trim))
write.csv(df.gnd.screen2.weighted,'gnd_weighted_screen_conjoint2.csv')
